{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def loadDataSet():\n",
    "    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\n",
    "                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\n",
    "                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\n",
    "                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\n",
    "                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\n",
    "                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]\n",
    "    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not\n",
    "    return postingList,classVec\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def creatVocabList(dataSet):\n",
    "    vocabSet = set([])\n",
    "    for document in dataSet:\n",
    "        vocabSet = vocabSet | set(document)\n",
    "    return list(vocabSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def setOfWords2Vec(vocabList, inputSet):\n",
    "    returnVec = [0]*len(vocabList)\n",
    "    for word in inputSet:\n",
    "        if word in vocabList :\n",
    "            returnVec[vocabList.index(word)] = 1\n",
    "        else:\n",
    "            print('the word :%s is not in my Vocabulary'%word)\n",
    "    return returnVec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['buying',\n",
       " 'please',\n",
       " 'help',\n",
       " 'mr',\n",
       " 'my',\n",
       " 'has',\n",
       " 'problems',\n",
       " 'worthless',\n",
       " 'take',\n",
       " 'food',\n",
       " 'not',\n",
       " 'how',\n",
       " 'flea',\n",
       " 'dog',\n",
       " 'cute',\n",
       " 'I',\n",
       " 'licks',\n",
       " 'quit',\n",
       " 'steak',\n",
       " 'love',\n",
       " 'stupid',\n",
       " 'maybe',\n",
       " 'so',\n",
       " 'him',\n",
       " 'garbage',\n",
       " 'ate',\n",
       " 'dalmation',\n",
       " 'to',\n",
       " 'park',\n",
       " 'posting',\n",
       " 'stop',\n",
       " 'is']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "listOPosts  , listClasses = loadDataSet()\n",
    "myVocabList = creatVocabList(listOPosts)\n",
    "myVocabList"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7, 8, 8, 5, 9)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(listOPosts[0]) , len(listOPosts[1]),len(listOPosts[2]),len(listOPosts[3]),len(listOPosts[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0,\n",
       " 1,\n",
       " 1,\n",
       " 0,\n",
       " 1,\n",
       " 1,\n",
       " 1,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 1,\n",
       " 1,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0,\n",
       " 0]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "setOfWords2Vec(myVocabList,listOPosts[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def trainNB0(trainMatrix,trainCategory):\n",
    "    numTrainDocs = len(trainMatrix)\n",
    "    numWords = len(trainMatrix[0])\n",
    "    pAbusive = np.sum(trainCategory)/np.float(numTrainDocs)\n",
    "    p0Num = np.ones(numWords)\n",
    "    p1Num = np.ones(numWords)\n",
    "    p0Denom = 1.0\n",
    "    p1Denom = 1.0\n",
    "    for i in  range(numTrainDocs):\n",
    "        if trainCategory[i] == 1:\n",
    "            p1Num += trainMatrix[i]\n",
    "            p1Denom += np.sum(trainMatrix[i])\n",
    "        else:\n",
    "            p0Num += trainMatrix[i]\n",
    "            p0Denom += np.sum(trainMatrix[i])\n",
    "    p1Vect = np.log(p1Num / p1Denom)\n",
    "    p0Vect = np.log(p0Num / p0Denom)\n",
    "    return p0Vect , p1Vect ,pAbusive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "trainMat = []\n",
    "for postinDoc in listOPosts:\n",
    "    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6, 32)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.array(trainMat).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "p0v , p1v , pAb = trainNB0(trainMat,listClasses)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.5, array([-3.21887582, -2.52572864, -2.52572864, -2.52572864, -1.83258146,\n",
       "        -2.52572864, -2.52572864, -3.21887582, -3.21887582, -3.21887582,\n",
       "        -3.21887582, -2.52572864, -2.52572864, -2.52572864, -2.52572864,\n",
       "        -2.52572864, -2.52572864, -3.21887582, -2.52572864, -2.52572864,\n",
       "        -3.21887582, -3.21887582, -2.52572864, -2.12026354, -3.21887582,\n",
       "        -2.52572864, -2.52572864, -2.52572864, -3.21887582, -3.21887582,\n",
       "        -2.52572864, -2.52572864]), array([-2.30258509, -2.99573227, -2.99573227, -2.99573227, -2.99573227,\n",
       "        -2.99573227, -2.99573227, -1.89711998, -2.30258509, -2.30258509,\n",
       "        -2.30258509, -2.99573227, -2.99573227, -1.89711998, -2.99573227,\n",
       "        -2.99573227, -2.99573227, -2.30258509, -2.99573227, -2.99573227,\n",
       "        -1.60943791, -2.30258509, -2.99573227, -2.30258509, -2.30258509,\n",
       "        -2.99573227, -2.99573227, -2.30258509, -2.30258509, -2.30258509,\n",
       "        -2.30258509, -2.99573227]))"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pAb,p0v , p1v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((32,), (32,))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1v.shape , p0v.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def classifyNB(vec2classify , p0Vec,p1Vec,pClass1):\n",
    "    p1 = np.sum(vec2classify*p1Vec) + np.log(pClass1)\n",
    "    p0 = np.sum(vec2classify*p0Vec) + np.log(1.0-pClass1)\n",
    "    if p1 > p0:\n",
    "        return 1\n",
    "    else :\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def testingNB():\n",
    "    lis0Posts , listClasses  = loadDataSet()\n",
    "    myVocabList = creatVocabList(listOPosts)\n",
    "    trainMat = []\n",
    "    for postinDoc in listOPosts:\n",
    "        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))\n",
    "    p0v , p1v , pAb = trainNB0(np.array(trainMat) , np.array(listClasses))\n",
    "    testEntry = ['love' , 'my','dalmation']\n",
    "    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))\n",
    "    print(testEntry,'classified one as :', classifyNB(thisDoc,p0v,p1v,pAb))\n",
    "    testEntry = ['stupid' ,'garbage']\n",
    "    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))\n",
    "    print(testEntry,'classified two as :', classifyNB(thisDoc,p0v,p1v,pAb))\n",
    "       "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['love', 'my', 'dalmation'] classified one as : 0\n",
      "['stupid', 'garbage'] classified two as : 1\n"
     ]
    }
   ],
   "source": [
    "testingNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def setOfWords2VecMN(vocabList, inputSet):\n",
    "    returnVec = [0]*len(vocabList)\n",
    "    for word in inputSet:\n",
    "        if word in vocabList :\n",
    "            returnVec[vocabList.index(word)] += 1\n",
    "        else:\n",
    "            print('the word :%s is not in my Vocabulary'%word)\n",
    "    return returnVec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['This',\n",
       " 'book',\n",
       " 'is',\n",
       " 'the',\n",
       " 'best',\n",
       " 'book',\n",
       " 'on',\n",
       " 'Python',\n",
       " 'or',\n",
       " 'M.L.',\n",
       " 'I',\n",
       " 'have',\n",
       " 'ever',\n",
       " 'laid',\n",
       " 'eyes',\n",
       " 'upon.']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'\n",
    "mySent.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\RUANJIAN\\Anaconda3\\Anaconda3_3\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: split() requires a non-empty pattern match.\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['This',\n",
       " 'book',\n",
       " 'is',\n",
       " 'the',\n",
       " 'best',\n",
       " 'book',\n",
       " 'on',\n",
       " 'Python',\n",
       " 'or',\n",
       " 'M',\n",
       " 'L',\n",
       " 'I',\n",
       " 'have',\n",
       " 'ever',\n",
       " 'laid',\n",
       " 'eyes',\n",
       " 'upon',\n",
       " '']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regEx = re.compile('\\\\W*')\n",
    "listOfTokens = regEx.split(mySent)\n",
    "listOfTokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['this',\n",
       " 'book',\n",
       " 'is',\n",
       " 'the',\n",
       " 'best',\n",
       " 'book',\n",
       " 'on',\n",
       " 'python',\n",
       " 'or',\n",
       " 'm',\n",
       " 'l',\n",
       " 'i',\n",
       " 'have',\n",
       " 'ever',\n",
       " 'laid',\n",
       " 'eyes',\n",
       " 'upon']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[tok.lower() for tok in listOfTokens if len(tok) > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\RUANJIAN\\Anaconda3\\Anaconda3_3\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: split() requires a non-empty pattern match.\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    }
   ],
   "source": [
    "emailText = open('email/ham/6.txt').read()\n",
    "listOfTokens = regEx.split(emailText)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def textParse(bigString):\n",
    "    listOfTokens = re.split(r'\\W*',bigString)\n",
    "    return [tok.lower() for tok in listOfTokens if len(tok) > 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def spamTest():\n",
    "    docList = []\n",
    "    classList = []\n",
    "    fullText  = []\n",
    "    for i in range(1,26):\n",
    "        wordList = textParse(open('email/spam/%d.txt'%i).read())\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        classList.append(1)\n",
    "        wordList = textParse(open('email/ham/%d.txt'%i).read())\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        classList.append(0)\n",
    "    vocabList = creatVocabList(docList)\n",
    "    trainingSet = list(np.arange(50))\n",
    "    testSet = []\n",
    "    for i in range(10):\n",
    "        randIndex = int(np.random.uniform(0,len(trainingSet)))\n",
    "        testSet.append(trainingSet[randIndex])\n",
    "        del trainingSet[randIndex]\n",
    "    trainMat =[] \n",
    "    trainClasses =[]\n",
    "    for docIndecx in trainingSet:\n",
    "        trainMat.append(setOfWords2VecMN(vocabList ,docList[docIndecx]))\n",
    "        trainClasses.append(classList[docIndecx])\n",
    "        p0v,p1v,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))\n",
    "        errorCount = 0\n",
    "    for docIndecx in testSet:\n",
    "        wordVector = setOfWords2VecMN(vocabList,docList[docIndecx])\n",
    "        if classifyNB(np.array(wordVector),p0v,p1v,pSpam) != classList[docIndecx]:\n",
    "            errorCount += 1\n",
    "    print('the error rate is :',np.float(errorCount) / len(testSet))\n",
    "        \n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "23.552114897010235"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.uniform(0,50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the error rate is : 0.1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\RUANJIAN\\Anaconda3\\Anaconda3_3\\lib\\re.py:203: FutureWarning: split() requires a non-empty pattern match.\n",
      "  return _compile(pattern, flags).split(string, maxsplit)\n"
     ]
    }
   ],
   "source": [
    "spamTest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\RUANJIAN\\Anaconda3\\Anaconda3_3\\lib\\re.py:203: FutureWarning: split() requires a non-empty pattern match.\n",
      "  return _compile(pattern, flags).split(string, maxsplit)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the error rate is : 0.0\n"
     ]
    }
   ],
   "source": [
    "spamTest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the error rate is : 0.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\RUANJIAN\\Anaconda3\\Anaconda3_3\\lib\\re.py:203: FutureWarning: split() requires a non-empty pattern match.\n",
      "  return _compile(pattern, flags).split(string, maxsplit)\n"
     ]
    }
   ],
   "source": [
    "spamTest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import feedparser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['updated', 'version', 'namespaces', 'updated_parsed', 'href', 'bozo', 'bozo_exception', 'encoding', 'feed', 'headers', 'entries', 'status'])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ny.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ny['entries'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import operator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def calcMostFreq(vocabList,fullText):\n",
    "    freqDict = {}\n",
    "    for token in vocabList:\n",
    "        freqDict[token] = fullText.count(token)\n",
    "    sortedFreq = sorted(freqDict.itmes(),\n",
    "                       key = operator.itemgetter(1),\n",
    "                       reverse =True)\n",
    "    return sortedFreq[:30]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def localWords(feed1 , feed0):\n",
    "    docList = []\n",
    "    classList = []\n",
    "    fullText  = []\n",
    "    minLen = np.min(len(feed1['feed']),len(feed0['feed']))\n",
    "    for i in range(minLen):\n",
    "        wordList = textParse(feed1['feed'][i]['summary'])\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        classList.append(1)\n",
    "        wordList = textParse(feed0['feed'][i]['summary'])\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        classList.append(0)\n",
    "    vocabList = creatVocabList(docList)\n",
    "    top30Words = calcMostFreq(vocabList,fullText)\n",
    "    for pairW in top30Words:\n",
    "        if pairW[0] in vocabList :\n",
    "            vocabList.remove(pairW[0])\n",
    "    trainingSet = np.arange(2*minLen)\n",
    "    testSet = []\n",
    "    for i in range(20):\n",
    "        randIndex = int(np.random.uniform(0,len(trainingSet)))\n",
    "        testSet.append(trainingSet[randIndex])\n",
    "        del trainingSet[randIndex]\n",
    "    trainMat =[] \n",
    "    trainClasses =[]\n",
    "    for docIndecx in trainingSet:\n",
    "        trainMat.append(setOfWords2VecMN(vocabList ,docList[docIndecx]))\n",
    "        trainClasses.append(classList[docIndecx])\n",
    "        p0v,p1v,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))\n",
    "        errorCount = 0\n",
    "    for docIndecx in testSet:\n",
    "        wordVector = setOfWords2VecMN(vocabList,docList[docIndecx])\n",
    "        if classifyNB(np.array(wordVector),p0v,p1v,pSpam) != classList[docIndecx]:\n",
    "            errorCount += 1\n",
    "    print('the error rate is :',np.float(errorCount) / len(testSet))\n",
    "    return vocabList , p0v,p1v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss')\n",
    "sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# vocabList,pSF,pNY= localWords(ny,sf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def getTopWords(ny,sf):\n",
    "    vocavList , p0v ,p1v = localWords(ny,sf)\n",
    "    topNy = []\n",
    "    topSF = []\n",
    "    for i in range(len(p0v)):\n",
    "        if p0v[i] > -6.0 : topSF.append((vocavList[i],p0v[i]))\n",
    "        if p1v[i] > -6.0 : topSF.append((vocavList[i],p1v[i]))\n",
    "    sortedSF = sorted(topSF ,\n",
    "                     key=lambda pair : pair[1] ,reverse = True)\n",
    "    print('________SF_______')\n",
    "    for item in sorted :\n",
    "        print(item[0])\n",
    "    sortedNY = sorted(topNy , key=lambda pair:pair[i],reverse = True)\n",
    "    print('________ny_______')\n",
    "    for item in sortedNY:\n",
    "        print(item[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
